#include <pthread.h>
#include <assert.h>
#include <err.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/signal.h>
#include <fcntl.h>

#include <mach/mach.h>
#include <mach/mach_vm.h>

/* Fixed virtual address at which the test region is requested. */
#define MAPPING_ADDR 0x900000000UL

/*
 * The same address in two forms: `mapping` is dereferenced directly by the
 * program, `mapping2` is the value handed to the Mach VM calls.
 */
unsigned long *mapping = (unsigned long *)MAPPING_ADDR;
mach_vm_address_t mapping2 = MAPPING_ADDR;

/* Value the reader thread expects to find in the first word of the region. */
static const unsigned long magic_value = 0;

/*
 * Number of iterations the reader thread has completed. Nothing in this file
 * reads it; presumably it is kept for inspection from a debugger.
 *
 * Unsigned on purpose: the reader loop never terminates, so a signed counter
 * would eventually overflow, which is undefined behaviour. Unsigned
 * wrap-around is well defined.
 */
volatile unsigned long loops = 0;

/*
 * Reader thread: spins forever, loading the first word of `mapping` on every
 * iteration and printing any value that differs from `magic_value`.
 *
 * The load goes through a volatile-qualified pointer so that it is performed
 * on every pass instead of being hoisted out of the loop.
 *
 * dummy: unused pthread start-routine argument.
 * Never returns.
 */
void *thread_fn(void *dummy) {
  (void)dummy;
  while (1) {
    loops++;
    unsigned long value = *(volatile unsigned long *)mapping;
    if (value != magic_value) {
      printf("saw 0x%016lx\n", value);
    }
  }
}

//void *thread_fn_dummy(void *dummy) { while (1); }

int main(void) {
  setbuf(stdout, NULL);
  if (mach_vm_allocate(mach_task_self(), &mapping2, 0x4000, VM_FLAGS_FIXED|VM_FLAGS_PURGABLE) != KERN_SUCCESS)
    errx(1, "mach_vm_allocate");
  assert(mapping2 == (unsigned long)mapping);
  mapping[1] = 1; // fault in

  pthread_t thread;
  //if (pthread_create(&thread, NULL, thread_fn_dummy, NULL))
  //  errx(1, "pthread_create");
  if (pthread_create(&thread, NULL, thread_fn, NULL))
    errx(1, "pthread_create");
  while (1) {
    usleep(1000);
    int state = VM_PURGABLE_EMPTY;
    int res;
    if ((res=mach_vm_purgable_control(mach_task_self(), mapping2, VM_PURGABLE_SET_STATE, &state)) != KERN_SUCCESS)
      errx(1, "mach_vm_purgable_control (set empty) = %d", res);
    usleep(1000);
    state = VM_PURGABLE_NONVOLATILE;
    if (mach_vm_purgable_control(mach_task_self(), mapping2, VM_PURGABLE_SET_STATE, &state) != KERN_SUCCESS)
      errx(1, "mach_vm_purgable_control (set nonvolatile)");
    mapping[1] = 1;
  }
}
=================

Unfortunately, the only Mac I have here is a Mac Mini, so I patched
the function pmap_flush() in XNU (version 4570.71.2) as follows to
make the same problem appear on machines with fewer cores (and also
added a bunch of printfs):

=================
        mp_disable_preemption();
 
        my_cpu = cpu_number();
        cpus_to_signal = pfc->pfc_cpus;
+       cpus_to_signal &= 0;
 
        PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
                            NULL, cpus_to_signal);
 
        for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
=================

With the patched kernel, I can occasionally observe the effects of
missing TLB flushes:

=================
$ gcc -o test test.c 
$ time ./test       
saw 0x0000000000000400
saw 0x00007fff64d472a8
saw 0xffffffffffffffff
^C

real    3m54.100s
user    3m53.462s
sys     0m4.032s
